---
title: "Online Appendix: What Does It Take to Be Rich?"
header-includes:
   - \usepackage{setspace}\doublespacing
   - \setlength\parindent{24pt}
   - \usepackage{pdflscape}
   - \newcommand{\blandscape}{\begin{landscape}}
   - \newcommand{\elandscape}{\end{landscape}}
output: 
  pdf_document:
    number_sections: false
    fig_caption: yes
    toc: true
fontsize: 12pt
bibliography: whoisrich.bib 
---

```{r setup, include=FALSE, warning=F}
# Hide chunk source code in the rendered document by default.
# `FALSE` is spelled out because `F` is an ordinary, reassignable variable.
knitr::opts_chunk$set(echo = FALSE)
# r version 4.2.3 

library(tidyverse)
library(ggplot2)
library(haven)
library(foreign)
library(purrr)
library(kableExtra)
library(gridExtra)

# Print numbers with two significant digits by default.
options(digits = 2)

# Survey data: US convenience sample (`pilot`), German sample (`respondi`)
# and US YouGov sample (`yougov`, an SPSS file read with haven).
pilot <- read.csv(file = "pilot.csv", na.strings = "NA")
respondi <- read.csv(file = "respondi.csv", na.strings = "NA")
yougov <- read_sav(file = "main.sav")

# Actual income distribution data used as the reference for the estimates.
de18 <- read.csv(file = "de18.csv")
us19 <- read.csv(file = "us19.csv")


# Winsorizing for analyses ----
# Method 1 (*_topcoded): reference the real income distribution. An estimate
# above the 99th percentile of actual incomes is recoded to that percentile.
# Method 2 (*_topcoded_2): reference only the respondents' guesses. An estimate
# above the 95th percentile of the sample's responses is recoded to that
# percentile. Both methods cap the top only; low estimates are left unchanged.

# Cap `x` at `cap`: values above `cap` become `cap`, all others stay as they
# are (NA stays NA).
topcode <- function(x, cap) {
  ifelse(x > cap, cap, x)
}

# 95th percentile of a set of responses, ignoring missing answers.
p95 <- function(x) {
  quantile(x, probs = 0.95, na.rm = TRUE)
}

# 99th percentile of the actual US and German income distributions, looked up
# once instead of inside every comparison.
us19_p99 <- us19[us19$percentiles == 99, "us19_quantiles"]
de18_p99 <- de18[de18$percentile == 99, "de18_quantile"]

pilot <- pilot %>%
  mutate(
    rich_topcoded = topcode(inc_rich_clean, us19_p99),
    rich_topcoded_2 = topcode(inc_rich_clean, p95(inc_rich_clean))
  )

respondi <- respondi %>%
  mutate(
    rich_topcoded = topcode(rich_inc, de18_p99),
    rich_topcoded_2 = topcode(rich_inc, p95(rich_inc)),
    poor_topcoded = topcode(poor_inc, de18_p99),
    poor_topcoded_2 = topcode(poor_inc, p95(poor_inc)),
    rich_poor_ratio = rich_topcoded / poor_topcoded
  )

yougov <- yougov %>%
  mutate(
    # 97 is recoded to missing -- presumably a non-response code; confirm
    # against the codebook.
    faminc_new = na_if(faminc_new, 97),
    rich_topcoded = topcode(Q1, us19_p99),
    poor_topcoded = topcode(Q2, us19_p99),
    rich_topcoded_2 = topcode(Q1, p95(Q1)),
    poor_topcoded_2 = topcode(Q2, p95(Q2)),
    rich_poor_ratio = rich_topcoded / poor_topcoded
  )



# Give each estimate a percentile ----
# The percentile of an estimate is the number of cutoffs of the actual income
# distribution that lie strictly below it. A missing estimate yields NA.
# Iterating over seq_along() keeps this safe for a zero-row sample, where
# `1:nrow(x)` would run over c(1, 0).
count_cutoffs_below <- function(estimates, cutoffs) {
  vapply(
    seq_along(estimates),
    function(i) sum(estimates[i] > cutoffs),
    integer(1)
  )
}

yougov$rich_percentile <- count_cutoffs_below(yougov$Q1, us19$us19_quantiles)
yougov$poor_percentile <- count_cutoffs_below(yougov$Q2, us19$us19_quantiles)
respondi$rich_percentile <- count_cutoffs_below(
  respondi$rich_inc, de18$de18_quantile
)
respondi$poor_percentile <- count_cutoffs_below(
  respondi$poor_inc, de18$de18_quantile
)


# Give each estimate a decile ----
# First find the decile cutoffs (10th, 20th, ..., 90th percentile).
# NOTE(review): rows are picked by position, so this assumes row i of `us19`
# and `de18` holds percentile i -- confirm against the data files.
decile_rows <- seq(10, 90, by = 10)
us19_deciles <- us19[decile_rows, "us19_quantiles"]
de18_deciles <- de18[decile_rows, "de18_quantile"]

# Then match as for percentiles: the decile of an estimate is 10 times the
# number of decile cutoffs it reaches (>=), so values run from 0 to 90.
# A missing estimate yields NA.
decile_reached <- function(estimates, decile_cutoffs) {
  n_reached <- vapply(
    seq_along(estimates),
    function(i) sum(estimates[i] >= decile_cutoffs),
    integer(1)
  )
  n_reached * 10
}

yougov$rich_decile <- decile_reached(yougov$rich_topcoded, us19_deciles)
yougov$poor_decile <- decile_reached(yougov$poor_topcoded, us19_deciles)
respondi$rich_decile <- decile_reached(respondi$rich_topcoded, de18_deciles)
respondi$poor_decile <- decile_reached(respondi$poor_topcoded, de18_deciles)


# Reverse question: for each percentile cutoff of the actual income
# distribution, what share of respondents count a household at that cutoff as
# rich (their "rich" threshold is at or below it) or as poor (their "poor"
# threshold is at or above it)?

# Share of non-missing `guesses` for which `compare(guess, cutoff)` holds,
# computed for every element of `cutoffs`.
share_of_guesses <- function(guesses, cutoffs, compare) {
  n_answered <- sum(!is.na(guesses))
  vapply(
    cutoffs,
    function(cutoff) sum(compare(guesses, cutoff), na.rm = TRUE) / n_answered,
    numeric(1),
    USE.NAMES = FALSE
  )
}

# The columns are assigned directly. The quantile column of `us19` is spelled
# out in full (`us19_quantiles`) rather than relying on `$` partial matching.
# NOTE(review): the fixed indices 100 and 1 below assume row i of `de18` and
# `us19` holds percentile i (100 rows) -- confirm against the data files.

de18$cumul_rich_respondi <- share_of_guesses(
  respondi$rich_inc, de18$de18_quantile, `<=`
)
# include those who estimated over the 100% cut-off in the last percentile count
de18$cumul_rich_respondi[100] <- 1

us19$cumul_rich_yougov <- share_of_guesses(
  yougov$Q1, us19$us19_quantiles, `<=`
)
# include those who estimated over the 100% cut-off in the last percentile count
us19$cumul_rich_yougov[100] <- 1

# for each percentile, what share of respondents think a person below that
# cutoff is poor?
de18$cumul_poor_respondi <- share_of_guesses(
  respondi$poor_inc, de18$de18_quantile, `>=`
)
# include those who estimated below the 1% cut-off in the bottom percentile count
de18$cumul_poor_respondi[1] <- 1

us19$cumul_poor_yougov <- share_of_guesses(
  yougov$Q2, us19$us19_quantiles, `>=`
)
# include those who estimated below the 1% cut-off in the bottom percentile count
us19$cumul_poor_yougov[1] <- 1

# Round the guesses for rich and poor up to deciles.
# roundUp(x, to) returns the smallest multiple of `to` that is >= x;
# exact multiples (including 0) are returned unchanged.
roundUp <- function(x, to = 10) {
  whole_steps <- x %/% to
  has_remainder <- as.logical(x %% to)
  to * (whole_steps + has_remainder)
}
# Turn a share guess into its decile. A guess of exactly 0 rounds to 0 and is
# collapsed into the implied "0 to 10" decile.
share_to_decile <- function(share) {
  decile <- roundUp(share)
  ifelse(decile == 0, 10, decile)
}

respondi <- mutate(
  respondi,
  rich_share_decile = share_to_decile(rich_share),
  poor_share_decile = share_to_decile(poor_share),
  richpluspoor_decile = share_to_decile(richpluspoor)
)

# Additional wrangling ----
# `above_median`: own income strictly above the sample median.
# `high_educ`: education code 9 or 10 (NOTE(review): presumably the codes for
# a BA and above, going by the plot legend -- confirm against the codebook).
respondi <- mutate(
  respondi,
  above_median = hh_inc > median(hh_inc, na.rm = TRUE),
  high_educ = (educ == 9) | (educ == 10)
)

yougov <- mutate(
  yougov,
  above_median = faminc_new > median(faminc_new, na.rm = TRUE)
)

# Return the most frequent value of `v`.
# Ties go to the value that appears first in `v`.
getmode <- function(v) {
  candidates <- unique(v)
  counts <- tabulate(match(v, candidates))
  candidates[which.max(counts)]
}

# Fill colour for the area charts: the blue from the ColorBrewer palette.
myblue <- "#1F78B4"
```


# Appendix A: Detailed description of samples

```{r setup rounding for in-text numbers 1, include=F}
# Set the `digits` option to 1 for the in-text (inline) numbers that follow.
options(digits=1)
```

The first sample was a convenience sample of `r nrow(pilot)` Americans collected through the Prolific Academic survey platform in 2020. This study was approved by [redacted] IRB. This sample only answered the question about the income it takes to be rich, and not the question about what income qualifies as poor. Table \ref{tab:us_demographics} shows the demographic characteristics of the sample.

The second sample is a representative sample of `r nrow(respondi)` Germans collected through the survey platform Respondi in 2020. This study was declared exempt from the requirement of a full ethics review by the [redacted] IRB. The questions about the rich and the poor, as well as an additional question about the share of the population that the respondents estimate to be rich or poor, were asked as part of a broader survey. Table \ref{tab:german_demographics} shows a comparison between the sample and the German population on targeted demographic variables. 

The third sample is a representative sample of `r nrow(yougov)` Americans collected through the survey company YouGov in 2021. This study was approved by [redacted] IRB. The questions about the rich and poor were asked as part of a broader survey within YouGov's daily Omnibus survey. Table \ref{tab:us_demographics} shows a comparison between this sample, the US convenience sample, and the US population on targeted demographic variables. 

\begin{table}
\begin{tabular}{llll}
 & German population & Survey (unweighted) & Survey (weighted)\\
Gender: male      & 49.2                    & 48.8                   & 49.0                 \\
Gender: female    & 50.8                    & 51.2                   & 51.0                 \\
Region: West      & 84.7                    & 69.4                   & 84.6                 \\
Region: East      & 15.3                    & 30.6                   & 15.4                 \\
Age: 18 - 39      & 31.6                    & 30.7                   & 31.7                 \\
Age: 40 - 59      & 35.2                    & 35.0                   & 35.2                 \\
Age: 60 - 99        & 33.2                    & 34.3                   & 33.1                 \\
Education: low    & 35.8                    & 31.5                   & 35.7                 \\
Education: middle & 30.5                    & 35.2                   & 30.6                 \\
Education: high   & 33.6                    & 33.3                   & 33.7             \\
\hline
\multicolumn{4}{l}{\footnotesize Note: Showing percentages. Population data from the German Mikrozensus as reported by Kantar Group.}
\end{tabular}
\caption{\label{tab:german_demographics}Demographic characteristics of German sample.}
\end{table}


\begin{table}
\begin{tabular}{llll}
& US population & Convenience sample & Representative sample \\
Gender: male (\%)   & 49    & 51    & 48 \\
Median age          & 39    & 30    & 48 \\
Median household income (\$) & 66k   & 40-50k   & 40-50k \\
College educated (\%) & 33    & 64    & 31 \\
White (\%)        & 72    & 71    & 65 \\
Black (\%)        & 13    & 9   & 12 \\
Hispanic (\%)     & 18    & 9   & 14 \\
\hline
\multicolumn{4}{l}{\footnotesize Note: Population data from the American Community Survey 2019 1-year estimates.} \\
\multicolumn{4}{l}{\footnotesize In the representative sample, race and ethnicity were measured in one variable.}
\end{tabular}
\caption{\label{tab:us_demographics}Demographic characteristics of the United States samples.}
\end{table}

# Appendix B: Percentile-based cumulative graphs of estimates

Figures \ref{fig:rich_cumulative_graphs} and \ref{fig:poor_cumulative_graphs} show estimates of the thresholds for rich and poor in the German and US representative samples, plotted as cumulative distributions of percentiles of the real income distribution. They show, for each percentile of the actual income distribution, what percent of respondents consider a household in that percentile rich or poor, respectively.

```{r rich cumulative graphs, fig.cap="\\label{fig:rich_cumulative_graphs}Estimates of threshold for rich, US and German representative samples"}
options(digits = 2)

# Layers shared by the US and German panels: filled area, black-and-white
# theme with centred titles, y-axis label and percentile breaks every 10.
cumulative_layers <- list(
  geom_area(fill = myblue),
  theme_bw(),
  theme(
    axis.text = element_text(size = 12),
    plot.title = element_text(hjust = 0.5),
    plot.subtitle = element_text(hjust = 0.5)
  ),
  ylab("Cumulative share \nof respondents"),
  scale_x_continuous(breaks = seq(0, 100, by = 10))
)
percentile_subtitle <- "Mapped to percentiles of the actual income distribution"

# US panel: share of YouGov respondents whose "rich" threshold is reached at
# each percentile of the actual income distribution.
us_rich_cumulative_graph <-
  ggplot(us19, aes(x = percentiles, y = cumul_rich_yougov)) +
  cumulative_layers +
  ggtitle(
    "US respondents' cumulative estimates of incomes that count as rich",
    subtitle = percentile_subtitle
  ) +
  xlab("US household income percentiles")

# German panel: the same for the Respondi sample.
ge_rich_cumulative_graph <-
  ggplot(de18, aes(x = percentile, y = cumul_rich_respondi)) +
  cumulative_layers +
  ggtitle(
    "German respondents' cumulative estimates of incomes that count as rich",
    subtitle = percentile_subtitle
  ) +
  xlab("German household income percentiles")

# Stack the two panels in one figure.
grid.arrange(us_rich_cumulative_graph, ge_rich_cumulative_graph)
```

```{r poor cumulative graphs, fig.cap="\\label{fig:poor_cumulative_graphs}Estimates of threshold for poor, US and German representative samples"}
options(digits = 2)

# Layers shared by the US and German panels: filled area, black-and-white
# theme with centred titles, y-axis label and percentile breaks every 10.
cumulative_layers <- list(
  geom_area(fill = myblue),
  theme_bw(),
  theme(
    axis.text = element_text(size = 12),
    plot.title = element_text(hjust = 0.5),
    plot.subtitle = element_text(hjust = 0.5)
  ),
  ylab("Cumulative share \nof respondents"),
  scale_x_continuous(breaks = seq(0, 100, by = 10))
)
percentile_subtitle <- "Mapped to percentiles of the actual income distribution"

# US panel: share of YouGov respondents whose "poor" threshold lies at or
# above each percentile of the actual income distribution.
us_poor_cumulative_graph <-
  ggplot(us19, aes(x = percentiles, y = cumul_poor_yougov)) +
  cumulative_layers +
  ggtitle(
    "US respondents' cumulative estimates of incomes that count as poor",
    subtitle = percentile_subtitle
  ) +
  xlab("US household income percentiles")

# German panel: the same for the Respondi sample.
ge_poor_cumulative_graph <-
  ggplot(de18, aes(x = percentile, y = cumul_poor_respondi)) +
  cumulative_layers +
  ggtitle(
    "German respondents' cumulative estimates of incomes that count as poor",
    subtitle = percentile_subtitle
  ) +
  xlab("German household income percentiles")

# Stack the two panels in one figure.
grid.arrange(us_poor_cumulative_graph, ge_poor_cumulative_graph)
```

# Appendix C: Perceptions by respondent income

Figures \ref{fig:us_rich_by_own_income} and \ref{fig:ge_rich_by_own_income} break down perceptions of the rich and poor thresholds by the respondents' own income, for the US and German representative samples respectively. 

```{r us rich by respondent income, fig.cap="\\label{fig:us_rich_by_own_income}Estimates of thresholds for rich and poor by respondent income, United States representative sample", warning=F}
options(digits = 2)

# Respondents with a known income group only.
us_by_income <- yougov %>%
  filter(!is.na(above_median))

# Layers shared by the rich and poor panels.
# Bar heights use `..prop..`, the proportion computed within each fill group,
# so every income group sums to 1 as the y-axis label states. The earlier
# `count / (total / 2)` only equals that share when both income groups happen
# to be exactly the same size.
us_income_layers <- list(
  geom_bar(aes(y = ..prop..), position = "dodge", color = "black", width = 9),
  theme_bw(),
  scale_fill_brewer(
    palette = "Paired",
    name = "Respondent income",
    labels = c("Below median", "Above median")
  ),
  theme(
    axis.text = element_text(size = 12),
    plot.title = element_text(hjust = 0.5),
    plot.subtitle = element_text(hjust = 0.5)
  ),
  xlab("US household income percentiles"),
  ylab("Share of respondents \nwithin income group"),
  scale_x_continuous(breaks = seq(0, 100, by = 10))
)
decile_subtitle <- "Mapped to deciles of the actual income distribution"

# `+ 5` centres each bar pair in the middle of its decile on the 0-100 axis.
us_rich_threshold_by_inc_graph <-
  ggplot(us_by_income, aes(x = rich_decile + 5, fill = above_median)) +
  us_income_layers +
  ggtitle(
    "US respondents' estimates of incomes that count as rich",
    subtitle = decile_subtitle
  )

us_poor_threshold_by_inc_graph <-
  ggplot(us_by_income, aes(x = poor_decile + 5, fill = above_median)) +
  us_income_layers +
  ggtitle(
    "US respondents' estimates of incomes that count as poor",
    subtitle = decile_subtitle
  )

grid.arrange(us_rich_threshold_by_inc_graph, us_poor_threshold_by_inc_graph)
```

```{r ge rich by respondent income, fig.cap="\\label{fig:ge_rich_by_own_income}Estimates of thresholds for rich and poor by respondent income, German representative sample", warning=F}
options(digits = 2)

# Respondents with a known income group only.
ge_by_income <- respondi %>%
  filter(!is.na(above_median))

# Layers shared by the rich and poor panels.
# Bar heights use `..prop..`, the proportion computed within each fill group,
# so every income group sums to 1 as the y-axis label states. The earlier
# `count / (total / 2)` only equals that share when both income groups happen
# to be exactly the same size.
ge_income_layers <- list(
  geom_bar(aes(y = ..prop..), position = "dodge", color = "black", width = 9),
  theme_bw(),
  scale_fill_brewer(
    palette = "Paired",
    name = "Respondent income",
    labels = c("Below median", "Above median")
  ),
  theme(
    axis.text = element_text(size = 12),
    plot.title = element_text(hjust = 0.5),
    plot.subtitle = element_text(hjust = 0.5)
  ),
  xlab("German household income percentiles"),
  ylab("Share of respondents \nwithin income group"),
  scale_x_continuous(breaks = seq(0, 100, by = 10))
)
decile_subtitle <- "Mapped to deciles of the actual income distribution"

# `+ 5` centres each bar pair in the middle of its decile on the 0-100 axis.
ge_rich_threshold_by_inc_graph <-
  ggplot(ge_by_income, aes(x = rich_decile + 5, fill = above_median)) +
  ge_income_layers +
  ggtitle(
    "German respondents' estimates of incomes that count as rich",
    subtitle = decile_subtitle
  )

ge_poor_threshold_by_inc_graph <-
  ggplot(ge_by_income, aes(x = poor_decile + 5, fill = above_median)) +
  ge_income_layers +
  ggtitle(
    "German respondents' estimates of incomes that count as poor",
    subtitle = decile_subtitle
  )

grid.arrange(ge_rich_threshold_by_inc_graph, ge_poor_threshold_by_inc_graph)
```

# Appendix D: Guesses for share rich or poor

The German sample was additionally asked to estimate the share of the public who are rich or poor. The questions immediately followed the threshold questions, and were phrased as:

- "And out of every 100 households in Germany, how many would you say make that much money or more? In other words: what percent of German households earn that much money or more?" 

- "And out of every 100 households in Germany, how many would you say make that little money or less? In other words: what percent of German households earn that little money or less?"

The questions for rich and poor appeared on consecutive survey pages, and responses were constrained to numeric responses between 0 and 100. Figure \ref{fig:share_rich_poor_by_own_income} illustrates respondents' guesses for what share of German households are rich and poor, respectively. The results are displayed separately for respondents with below-median and above-median incomes. The relationship between respondent income and thresholds for rich and poor no longer holds when respondents are instead asked about shares of the population. 

Figure \ref{fig:share_rich_poor_by_own_educ} additionally displays respondents' guesses for what share of the population is either rich or poor (the sum of the two separate guesses for share rich and share poor), but with the respondents split by formal education rather than income. Logically impossible answers are more common among those with lower formal education, highlighting the concern that questions involving shares of the population are complicated for many respondents.

```{r GE share rich and poor by own income graphs, warning=F, fig.cap="\\label{fig:share_rich_poor_by_own_income}Estimates of share of German households that are rich or poor, by respondent income"}
# Respondents with a known income group only.
ge_share_sample <- respondi %>%
  filter(!is.na(above_median))

# Layers shared by the rich and poor panels. Bars show the proportion within
# each income group (`..prop..`). Both panels now get the same axis breaks
# every 10 points, so the stacked panels can be compared directly; previously
# only the rich panel set them.
share_by_income_layers <- list(
  geom_bar(aes(y = ..prop..), position = "dodge", color = "black", width = 9),
  theme_bw(),
  scale_fill_brewer(
    palette = "Paired",
    name = "Respondent income",
    labels = c("Below median", "Above median")
  ),
  theme(
    axis.text = element_text(size = 12),
    plot.title = element_text(hjust = 0.5),
    plot.subtitle = element_text(hjust = 0.5)
  ),
  ylab("Share of respondents \nwithin income group"),
  scale_x_continuous(breaks = seq(0, 100, by = 10))
)

ge_rich_share_by_inc_graph <-
  ggplot(ge_share_sample, aes(x = rich_share_decile, fill = above_median)) +
  share_by_income_layers +
  ggtitle(
    "German respondents' estimates:",
    subtitle = "What share of households are rich"
  ) +
  xlab("Respondent guess for share of households that are rich")

ge_poor_share_by_inc_graph <-
  ggplot(ge_share_sample, aes(x = poor_share_decile, fill = above_median)) +
  share_by_income_layers +
  ggtitle(
    "German respondents' estimates:",
    subtitle = "What share of households are poor"
  ) +
  xlab("Respondent guess for share of households that are poor")

grid.arrange(ge_rich_share_by_inc_graph, ge_poor_share_by_inc_graph)
```


```{r GE share rich plus poor by own education graph, warning=F, fig.cap="\\label{fig:share_rich_poor_by_own_educ}Estimates of share of German households that are rich or poor, by respondent education"}
# Respondents with a known education flag only.
educ_sample <- filter(respondi, !is.na(high_educ))

# Dodged bars of the combined rich-plus-poor share guess (in deciles), shown
# as the proportion within each education group.
ge_poor_share_by_educ_graph <-
  ggplot(educ_sample, aes(x = richpluspoor_decile, fill = high_educ)) +
  geom_bar(aes(y = ..prop..), position = "dodge", color = "black", width = 9) +
  theme_bw() +
  scale_fill_brewer(
    palette = "Paired",
    name = "Respondent education",
    labels = c("Less than BA", "BA and above")
  ) +
  ggtitle(
    "German respondents' estimates:",
    subtitle = "What share of households are either rich or poor"
  ) +
  theme(
    axis.text = element_text(size = 12),
    plot.title = element_text(hjust = 0.5),
    plot.subtitle = element_text(hjust = 0.5)
  ) +
  xlab("Respondent guess for share of households who are either rich or poor\n(imputed from separate guesses for rich and for poor)") +
  ylab("Share of respondents \nwithin education group")
ge_poor_share_by_educ_graph
```


